--- title: Core keywords: fastai sidebar: home_sidebar summary: "Core functionality for the fastai audio library." description: "Core functionality for the fastai audio library." ---
This section regroups the basic types used in audio processing, along with the transforms that create objects of those types.
# Download (if needed) and extract the 10-speaker sample dataset.
p = untar_data(URLs.SPEAKERS10, extract_func=tar_extract_at_filename)
# Recursively gather every audio file under the dataset root.
audio_getter = AudioGetter("", recurse=True, folders=None)
files = audio_getter(p)#files will load differently on different machines so we specify examples by name
# Fixed example files referenced by name so results are reproducible across machines.
ex_files = [p/f for f in ['m0005_us_m0005_00218.wav',
'f0003_us_f0003_00279.wav',
'f0001_us_f0001_00168.wav',
'f0005_us_f0005_00286.wav',]]
Indexing is patched so that an AudioTensor keeps its type (rather than degrading to a plain Tensor) when indexed.
# An AudioTensor wraps a tensor plus a sample rate (sr).
AudioTensor(torch.ones(10), sr=100)
item0 = AudioTensor.create(ex_files[0])
item0.shape
# Convenience properties derived from the underlying data and sample rate.
item0.sr, item0.nchannels, item0.nsamples, item0.duration
test_eq(type(item0.data), torch.Tensor)
test_eq(item0.sr, 16000)
test_eq(item0.nchannels, 1)
test_eq(item0.nsamples, 58240)
test_eq(item0.duration, 3.64)  # 58240 samples / 16000 Hz = 3.64 s
item0[0]
item0.show()
item1 = AudioTensor.create(files[1]);
item0.show()
item1.show()
#get 3 equal length portions of 3 different signals so we can stack them
#for a fake multichannel example
ai0, ai1, ai2 = map(AudioTensor.create, ex_files[1:4]);
# trim all three signals to the shortest length so they can be stacked
min_samples = min(ai0.nsamples, ai1.nsamples, ai2.nsamples)
s0, s1, s2 = map(lambda x: x[:,:min_samples], (ai0, ai1, ai2))
# NOTE(review): this display cell shows which sr survives when two
# AudioTensors with different srs are combined -- the resolution rule is
# not visible here, confirm against the AudioTensor implementation
tst0 = AudioTensor(torch.ones(10), sr=120)
tst1 = AudioTensor(torch.ones(10), sr=150)
(tst0 + tst1).sr
test_eq(s0.shape, s1.shape)
test_eq(s1.shape, s2.shape)
# stack the three mono signals on the channel dim to fake a 3-channel audio
fake_multichannel = AudioTensor(torch.stack((s0, s1, s2), dim=1).squeeze(0), sr=16000)
test_eq(fake_multichannel.nchannels, 3)
test_eq(fake_multichannel.nsamples, 53760)
fake_multichannel.show()
The repr of a Transform is: `classname: self.use_as_item {self.encodes} {self.decodes}`.
`encodes` and `decodes` are TypeDispatch objects whose reprs are string representations of a dict in which each key/value pair is a type name and the function that handles that type.
# The repr of OpenAudio shows its encodes/decodes TypeDispatch tables.
oa = OpenAudio(files); oa
#demonstrate functionality of OpenAudio.encodes, the rest of the nb will
#use files that are opened by name for reproducibility/testing
oa = OpenAudio(files)
item100 = oa.encodes(100)
item100.show()
#test open audio on a random set of files: encodes(i) must load the same
#tensor as AudioTensor.create on the corresponding path, and decodes(i)
#must return the path itself
for i in range(10):
    idx = random.randint(0, len(files)-1)
    test_eq_type(oa.encodes(idx), AudioTensor.create(files[idx]))
    test_eq_type(oa.decodes(idx), files[idx])
type(oa)
oa.encodes(0)
oa.decodes(0)
oa.items[0]
`function_list += f(**usable_kwargs)` only works if all args are keyword arguments; it doesn't work for unnamed (positional-only) args. We could add a `get_usable_args` that checks whether a parameter's default is `inspect._empty`. This also needs more tests. `get_usable_kwargs` takes a function and a dictionary of kwargs that may or may not be relevant to that function, and returns a dictionary of all of that function's default values, updated with whichever kwargs can be successfully applied. This is done because, first, it allows us to combine multiple functions into a single AudioToSpec Transform while passing only the appropriate kwargs; second, it allows us to keep a dictionary of the settings used to create the Spectrogram, which is sometimes used in its display and cropping; and third, it allows us to warn the user when they pass improper or unused kwargs.
Example: testing with a function that only takes a and b as kwargs
# Toy function whose only keyword arguments are `a` and `b`, both with defaults.
def test_kwargs(a:int=10, b:int=20):
    pass
# A dict whose keys all apply, and a superset containing irrelevant keys.
kwargs = {'a': 1, 'b': 2}
extra_kwargs = {'z': 0, 'a': 1, 'b': 2, 'c': 3}
# In both cases only the applicable keys should come back.
test_eq(get_usable_kwargs(test_kwargs, kwargs), kwargs)
test_eq(get_usable_kwargs(test_kwargs, extra_kwargs, []), kwargs)
item0 = AudioTensor.create(ex_files[0])
# SpectrogramTransformer returns a transform constructor; mel=True/to_db=True
# selects a db-scaled mel spectrogram.
DBMelSpec = SpectrogramTransformer(mel=True, to_db=True)
# 'baloney' is not a kwarg of any underlying transform, so construction warns.
a2s = DBMelSpec(n_fft=2048, hop_length=128, n_mels=64, baloney="hi")
type(item0)
item0.__dict__
# Merge conflict resolved: keep the HEAD side, which is a superset of master.
a2s
sg = a2s(item0)
# The input AudioTensor is left unchanged by the transform.
item0.__dict__
type(item0)
item0.shape
sg.shape
# settings records the kwargs used to build the spectrogram (used in show/crop).
sg.settings
sg.show()
# get a sg with weird settings for testing
item0 = AudioTensor.create(ex_files[0])
item1 = AudioTensor.create(ex_files[1])
# non-default f_max/n_mels so we can verify they propagate into the sg
a2s = DBMelSpec(f_max = 20000, n_mels=137)
sg = a2s(item0)
sg1 = a2s(item1)
sg.show()
sg1.show()
# the transform also handles multichannel audio
sg_mc = a2s(fake_multichannel)
sg_mc.show()
sg.shape
sg._settings
sg.nchannels, sg.height, sg.width
#test the explicit settings were properly stored in the spectrogram object and can be accessed as attributes
test_eq(sg.f_max, 20000)
test_eq(sg.hop_length, 512)  # hop_length was not passed explicitly; 512 is presumably the default -- confirm
test_eq(sg.sr, item100.sr)
test_eq(sg.mel, True)
test_eq(sg.to_db, True)
test_eq(sg.nchannels, 1)
test_eq(sg.height, 137)  # for mel spectrograms, height == n_mels
test_eq(sg.n_mels, sg.height)
test_eq(sg.width, 114)
# pull the default kwargs of the underlying mel spectrogram generator
defaults = {k:v.default for k, v in inspect.signature(_GenMelSpec).parameters.items()}
a2s = DBMelSpec(f_max =20000, hop_length=345)
sg = a2s(item100)
# unspecified settings fall back to the generator defaults...
test_eq(sg.n_mels, defaults["n_mels"])
test_eq(sg.n_fft , 1024)
test_eq(sg.shape[1], sg.n_mels)
# ...while explicitly passed ones are kept
test_eq(sg.hop_length, 345)
# test the spectrogram and audio have same duration, both are computed
# on the fly as transforms can change their duration
test_close(sg.duration, item100.duration, eps=0.1)
# spectrogram of a pure low-frequency tone to sanity-check the frequency axis
a2s_5hz = DBMelSpec(
sample_rate=16000,
n_fft=1024,
win_length=1024,
hop_length=512,
f_min=0.0,
f_max=20000,
pad=0,
n_mels=137,
)
# one second of a 5 Hz cosine at 16 kHz, amplitude 0.5
sine_5hz = torch.Tensor([0.5 * np.cos(2 * np.pi * 5 * np.arange(0, 1.0, 1.0/16000))])
at_5hz = AudioTensor(sine_5hz, 16000)
sg_5hz = a2s_5hz(at_5hz)
sg_5hz.show()
# testing to make sure the lowest bin of the spectrogram has the highest value/most energy
max_row = sg_5hz.max(dim=1).indices.mode().values.item()
assert max_row < 2
SHOW_W=True  # set to False to hide the warning text emitted by the tests below
#test warning for unused argument 'power' for melspec
#tests AudioToSpec and its from_cfg class method
voice_mel_cfg = {'mel':True, 'to_db':True, 'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256, 'power':2}
test_warns(lambda: AudioToSpec.from_cfg(voice_mel_cfg), show=SHOW_W)
test_warns(lambda: DBMelSpec(power=2, n_fft=2560, f_max=22050, n_mels=128), show=SHOW_W)
#test for unused arguments 'f_max' and 'n_mels' for non-mel Spectrogram
voice_mel_cfg = {'mel':False, 'to_db':True, 'f_max':22050., 'n_mels':128, 'n_fft':2560, 'hop_length':256, 'power':2}
test_warns(lambda: AudioToSpec.from_cfg(voice_mel_cfg), show=SHOW_W)
#test warning for unused argument 'top_db' when db conversion not done
voice_mel_cfg = {'mel':True, 'to_db':False, 'top_db':20, 'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256}
test_warns(lambda: AudioToSpec.from_cfg(voice_mel_cfg), show=SHOW_W)
#test warning for invalid argument 'doesntexist'
voice_mel_cfg = {'mel':True, 'to_db':True,'doesntexist':True, 'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256}
test_warns(lambda: AudioToSpec.from_cfg(voice_mel_cfg), show=SHOW_W)
# build each spectrogram variant so we can compare their speed below
a_to_db_mel = SpectrogramTransformer()()
a_to_nondb_mel = SpectrogramTransformer(to_db=False)()
a_to_db_nonmel = SpectrogramTransformer(mel=False)()
a_to_nondb_non_mel = SpectrogramTransformer(mel=False, to_db=False)()
a_to_db_mel_hyperparams = SpectrogramTransformer()(n_fft=8192, hop_length=128)
%%timeit -n10
a_to_db_mel(item0)
%%timeit -n10
a_to_nondb_mel(item0)
%%timeit -n10
# NOTE(review): this cell times a_to_nondb_mel again, duplicating the
# previous cell -- possibly a different variant was intended; verify
a_to_nondb_mel(item0)
%%timeit -n10
a_to_db_nonmel(item0)
%%timeit -n10
a_to_nondb_non_mel(item0)
%%timeit -n10
# Time can blow up as a factor of n_fft and hop_length. n_fft is best kept to a power of two, hop_length
# doesn't matter except smaller = more time because we have more chunks to perform STFTs on
a_to_db_mel_hyperparams(item0)
import time
def time_variable_length_audios(f, max_seconds=30, sr=16000, channels=1):
    """Time `f` on random audios of increasing duration.

    Builds audios of 1, 3, ..., up to `max_seconds` seconds at sample rate
    `sr` with `channels` channels, applies `f` to each, and returns the
    per-call wall times in milliseconds (rounded to two decimals).
    """
    times = []
    audios = [AudioTensor(torch.randn(channels, sr*i), sr) for i in range(1,max_seconds+1,2)]
    for a in audios:
        # perf_counter is monotonic and higher-resolution than time.time,
        # making it the right clock for short-interval measurement
        start = time.perf_counter()
        f(a)  # result discarded; only the elapsed time matters
        end = time.perf_counter()
        times.append(round(1000*(end-start), 2))
    return times
%%time
a2s = SpectrogramTransformer()()
max_seconds = 180
times_mono = time_variable_length_audios(f=a2s, max_seconds=max_seconds)
times_stereo = time_variable_length_audios(f=a2s, max_seconds=max_seconds, channels=2)
plt.plot(np.arange(0,max_seconds,2), times_mono, label="mono")
plt.plot(np.arange(0,max_seconds,2), times_stereo, label="stereo")
plt.legend(['mono','stereo'])
plt.title("Time Taken by AudioToSpec")
plt.xlabel("Audio Duration in Seconds")
plt.ylabel("Processing Time in ms")
item0 = AudioTensor.create(ex_files[0])
# default MFCC transform; n_mfcc sets the number of coefficients (rows)
a2mfcc = AudioToMFCC()
mfcc = a2mfcc(item0)
test_eq(mfcc.n_mfcc, mfcc.data.shape[1])
mfcc.show()
mfcc._settings
mfcc.height
mfcc.width
#n_mfcc specified should determine the height of the mfcc
item1 = AudioTensor.create(ex_files[1])
n_mfcc = 67
a2mfcc67 = AudioToMFCC(n_mfcc=n_mfcc)
mfcc67 = a2mfcc67(item1)
test_eq(mfcc67.shape[1], n_mfcc)
print(mfcc67.shape)
mfcc67.show()
# melkwargs are forwarded to the underlying mel spectrogram computation
a2mfcc_kwargs = AudioToMFCC(melkwargs={"hop_length":1024, "n_fft":1024})
mfcc_kwargs = a2mfcc_kwargs(item1)
mfcc_kwargs.show()
# make sure a new hop_length changes the resulting width
test_ne(mfcc_kwargs.width, mfcc.width)
%%time
a2mfcc = AudioToMFCC()
max_seconds = 180
times_mono = time_variable_length_audios(f=a2mfcc, max_seconds=max_seconds)
times_stereo = time_variable_length_audios(f=a2mfcc, max_seconds=max_seconds, channels=2)
plt.plot(np.arange(0,max_seconds,2), times_mono, label="mono")
plt.plot(np.arange(0,max_seconds,2), times_stereo, label="stereo")
plt.legend(['mono','stereo'])
plt.title("Time Taken by AudioToMFCC")
plt.xlabel("Audio Duration in Seconds")
plt.ylabel("Processing Time in ms")
# db-scaled mel spectrogram pipeline: open each file, then transform it
mel_cfg = {'n_fft':2560,'hop_length':64}
oa = OpenAudio(files)
a2s = DBMelSpec(**mel_cfg)
db_mel_pipe = Pipeline([oa,a2s])
for i in range(5):
    print("Shape:", db_mel_pipe(i).shape)
    db_mel_pipe.show(db_mel_pipe(i))
# plain (non-mel, non-db) spectrogram built from a config dict
cfg = {'mel':False, 'to_db':False, 'hop_length':128, 'n_fft':400}
oa = OpenAudio(files)
a2s = AudioToSpec.from_cfg(cfg)
db_mel_pipe = Pipeline([oa, a2s])
for i in range(3):
    print("Shape:", db_mel_pipe(i).shape)
    db_mel_pipe.show(db_mel_pipe(i))
    # the hop_length from the config should be stored on each result
    test_eq(db_mel_pipe(i).hop_length, cfg["hop_length"])
# same non-mel spectrogram built via SpectrogramTransformer directly
oa = OpenAudio(files)
a2s = SpectrogramTransformer(mel=False)()
db_mel_pipe = Pipeline([oa, a2s])
for i in range(3):
    print("Shape:", db_mel_pipe(i).shape)
    db_mel_pipe.show(db_mel_pipe(i))
#non-mel db-scale spectrogram, warning is expected as f_max is an argument to melspectrograms
cfg = {'mel':False, 'to_db':True, 'n_fft':260, 'f_max':22050., 'hop_length':128}
oa = OpenAudio(files)
a2s = AudioToSpec.from_cfg(cfg)
db_mel_pipe = Pipeline([oa, a2s])
for i in range(3):
    db_mel_pipe.show(db_mel_pipe(i))
# MFCC pipeline with 40 coefficients
db_mfcc_pipe = Pipeline([oa, AudioToMFCC(n_mfcc=40),])
for i in range(3):
    db_mfcc_pipe.show(db_mfcc_pipe(i))
# Merge conflict resolved: both sides were identical except HEAD also
# displayed oa(42); keep the HEAD version.
oa(42)
# Basic Mel Spectrogram is just the Torchaudio defaults, which are currently bad, hence
# the empty melbins in the spectrogram below. We can make our own custom good ones like Voice
mel_cfg = AudioConfig.BasicMelSpectrogram()
a2mel = AudioToSpec.from_cfg(mel_cfg)
item0 = AudioTensor.create(ex_files[0])
mel_bad = a2mel(item0)
mel_bad.show()
voice_cfg = AudioConfig.Voice()
a2mel = AudioToSpec.from_cfg(voice_cfg)
mel_good = a2mel(oa(42))
mel_good.show()
test_eq(mel_bad.n_fft, mel_cfg.n_fft)
# hop defaults to None in torchaudio but is set later in the code, we override this default to None
# internally in AudioToSpec to ensure the correct hop_length is stored as a sg attribute
test_ne(mel_bad.hop_length, mel_cfg.hop_length)
print("MelConfig Default Hop:", mel_cfg.hop_length)
print("Resulting Hop:",mel_bad.hop_length)
sg_cfg = AudioConfig.BasicSpectrogram()
# make sure mel setting is passed down and is false for normal spectro
test_eq(sg_cfg.mel, False)
#Grab a random file, test that the n_fft are passed successfully via config and stored in sg settings
oa = OpenAudio(files)
# random.randint is inclusive on both ends, so the upper bound must be
# len(files)-1 (matches the earlier OpenAudio test); len(files) could
# produce an out-of-range index
f_num = random.randint(0, len(files)-1)
sg_cfg = AudioConfig.BasicSpectrogram(n_fft=2000, hop_length=155)
a2sg = AudioToSpec.from_cfg(sg_cfg)
sg = a2sg(oa(f_num))
test_eq(sg.n_fft, sg_cfg.n_fft)
# width is the number of frames: nsamples/hop_length (integer part) + 1
test_eq(sg.width, int(oa(f_num).nsamples/sg_cfg.hop_length)+1)
# pipeline from the basic spectrogram config above
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(sg_cfg)])
for i in range(3):
    db_mel_pipe.show(db_mel_pipe(i))
# the Voice config repr displays its settings
voice_config = AudioConfig.Voice(); voice_config
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(voice_config)])
for i in range(3):
    db_mel_pipe.show(db_mel_pipe(i))
# MFCC pipeline from the basic MFCC config
mfcc_cfg = AudioConfig.BasicMFCC()
oa = OpenAudio(files)
mfcc_pipe = Pipeline([oa, AudioToMFCC.from_cfg(mfcc_cfg)])
for i in range(44,47):
    print("Shape", mfcc_pipe(i).shape)
    mfcc_pipe(i).show()